View(house)
empty_list <- rep(0, length(house[1,]) - 1)
NAcounts <- data.frame(empty_list, empty_list, empty_list)
colnames(NAcounts) <- c('Column', 'NA_Counts', 'NA_Ratio')
for (i in 1:length(house[1,])) {
NAcounts[i,1] <- as.character(colnames(house))[i]
NAcounts[i,2] <- sum(is.na(house[,i]))
NAcounts[i,3] <- mean(is.na(house[,i]))
}
# View(NAcounts)
datatable(NAcounts)
# HouseSummary <- cbind(NAcounts, empty_list)
# colnames(HouseSummary) <- c('Column', 'NA_Counts', 'NA_Ratio', 'Type')
#
# house_t <- tibble(house)
#
# for (i in 1:length(house[1,])) {
# HouseSummary[i,4] <- typeof(house_t[,i])
# }
# View(HouseSummary)
# This didn't work for me.
str(house)
# PoolQC is mostly NAs
table(house$PoolQC)
# MiscFeature is also a rare occurance
table(house$MiscFeature)
# This is good for now
table(house$MiscVal)
# I could label encode this
table(house$Alley)
# Label encode
table(house$Fence)
# Grab values
table(house$FireplaceQu) # this is better than using unique() because I can check the counts
# This is quantitative. "Linear feet of street connected to property"
# What does an NA mean? Probably just missing. There are enough NAs that I don't want to drop this column.
# I'm going to impute the median.
unique(house$LotFrontage)
lotfront_med <- median(drop_na(house, LotFrontage)$LotFrontage)
# I would like to just collapse it to a binary 'HasGarage'. But I want this to be quality data.
# Note: In every place that these are NA it means there is no garage.
table(house$GarageQual) # some are VERY short on observations
table(house$GarageCond) # some are VERY short on observations
table(house$GarageType) # some of these are real short on observations
table(house$GarageYrBlt) # Not sure how to deal with this one. I'll match the year built date.
table(house$GarageFinish) # just fine
# Will assume NAs are 'None' and 0, respectively
table(house$MasVnrType)
table(house$MasVnrArea)
# Hmm. And this has one NA.
table(house$Electrical)
# These are factors
table(house$MSSubClass)
palette(c("red","red","red","blue","blue","orange","orange","black","blue","blue","red","red","blue","orange","blue","blue"))
plot(log(SalePrice) ~ MSSubClass, data = house, col = factor(MSSubClass))
plot(log(SalePrice) ~ YearBuilt, data = house, col = factor(MSSubClass))
palette("default")
plot(log(SalePrice) ~ YearBuilt, data = house, col = factor(MSSubClass))
house2 <- house %>%
mutate(HasPool = !is.na(PoolQC)) %>%
select(-PoolQC) %>%
mutate(HasBonus = !is.na(MiscFeature)) %>%
select(-MiscFeature) %>%
mutate(Alley = ifelse(is.na(Alley), 0, ifelse(Alley == 'Grvl', 1, 2))) %>%
mutate(Fence = ifelse(Fence == 'GdPrv', 4, ifelse(Fence == 'MnPrv', 3, ifelse(Fence == 'GdWo', 2, ifelse(Fence == 'MnWw', 1, 0))))) %>%
mutate(Fence = replace_na(Fence, 0)) %>% # ifelse didn't seem to like commands like is.na()
mutate(FireplaceQu = case_when(FireplaceQu == 'Ex' ~ 5, FireplaceQu == 'Gd' ~ 4, FireplaceQu == 'TA' ~ 3, FireplaceQu == 'Fa' ~ 2, FireplaceQu == 'Po' ~ 1)) %>%
mutate(FireplaceQu = replace_na(FireplaceQu, 0)) %>%
mutate(LotFrontage = replace_na(LotFrontage, lotfront_med)) %>% # Can't use this one for Math 425
mutate(GarageType = fct_explicit_na(GarageType, 'None')) %>% # Useful little function
mutate(GarageFinish = case_when(GarageFinish == 'Fin' ~ 3, GarageFinish == 'Rfn' ~ 2, GarageFinish == 'Unf' ~ 1)) %>%
mutate(GarageFinish = replace_na(GarageFinish, 0)) %>%
mutate(GarageCond = case_when(GarageCond == 'Ex' | GarageCond == 'Gd' ~ 3, GarageCond == 'TA' ~ 2, GarageCond == 'Fa' | GarageCond == 'Po' ~ 1)) %>%
mutate(GarageCond = replace_na(GarageCond, 0)) %>%
mutate(GarageQual = case_when(GarageQual == 'Ex' | GarageQual == 'Gd' ~ 3, GarageQual == 'TA' ~ 2, GarageQual == 'Fa' | GarageQual == 'Po' ~ 1)) %>%
mutate(GarageQual = replace_na(GarageQual, 0)) %>%
mutate(MasVnrType = replace_na(MasVnrType, 'None')) %>% # This works because 'None' is already a factor
mutate(MasVnrArea = replace_na(MasVnrArea, 0)) %>%
select(-Electrical) # I'm not too worried.
# This method works, so why not?
house2$GarageYrBlt[is.na(house2$GarageYrBlt)] <- house2$YearBuilt[is.na(house2$GarageYrBlt)] # Having imputed those values, I can't use GarageYrBlt for a model in Math 425.
# At this rate I won't get it done, so unfortunately I have to move on to building a model.
#View(house2)
# Just checking
summary(lm(house$SalePrice ~ house$Id))
##
## Call:
## lm(formula = house$SalePrice ~ house$Id)
##
## Residuals:
## Min 1Q Median 3Q Max
## -146990 -51260 -18034 32712 573920
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 183937.935 4160.774 44.208 <2e-16 ***
## house$Id -4.130 4.934 -0.837 0.403
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 79450 on 1458 degrees of freedom
## Multiple R-squared: 0.0004803, Adjusted R-squared: -0.0002052
## F-statistic: 0.7007 on 1 and 1458 DF, p-value: 0.4027
# Sample, because this takes too long
pairs(sample_n(cbind(house2$SalePrice,house2[,2:10]), 300))
tibble(house2)
## # A tibble: 1,460 x 1
## house2$Id $MSSubClass $MSZoning $LotFrontage $LotArea $Street $Alley
## <int> <int> <fct> <int> <int> <fct> <dbl>
## 1 1 60 RL 65 8450 Pave 0
## 2 2 20 RL 80 9600 Pave 0
## 3 3 60 RL 68 11250 Pave 0
## 4 4 70 RL 60 9550 Pave 0
## 5 5 60 RL 84 14260 Pave 0
## 6 6 50 RL 85 14115 Pave 0
## 7 7 20 RL 75 10084 Pave 0
## 8 8 60 RL 69 10382 Pave 0
## 9 9 50 RM 51 6120 Pave 0
## 10 10 190 RL 50 7420 Pave 0
## # ... with 1,450 more rows, and 73 more variables: $LotShape <fct>,
## # $LandContour <fct>, $Utilities <fct>, $LotConfig <fct>,
## # $LandSlope <fct>, $Neighborhood <fct>, $Condition1 <fct>,
## # $Condition2 <fct>, $BldgType <fct>, $HouseStyle <fct>,
## # $OverallQual <int>, $OverallCond <int>, $YearBuilt <int>,
## # $YearRemodAdd <int>, $RoofStyle <fct>, $RoofMatl <fct>,
## # $Exterior1st <fct>, $Exterior2nd <fct>, $MasVnrType <fct>,
## # $MasVnrArea <dbl>, $ExterQual <fct>, $ExterCond <fct>,
## # $Foundation <fct>, $BsmtQual <fct>, $BsmtCond <fct>,
## # $BsmtExposure <fct>, $BsmtFinType1 <fct>, $BsmtFinSF1 <int>,
## # $BsmtFinType2 <fct>, $BsmtFinSF2 <int>, $BsmtUnfSF <int>,
## # $TotalBsmtSF <int>, $Heating <fct>, $HeatingQC <fct>,
## # $CentralAir <fct>, $X1stFlrSF <int>, $X2ndFlrSF <int>,
## # $LowQualFinSF <int>, $GrLivArea <int>, $BsmtFullBath <int>,
## # $BsmtHalfBath <int>, $FullBath <int>, $HalfBath <int>,
## # $BedroomAbvGr <int>, $KitchenAbvGr <int>, $KitchenQual <fct>,
## # $TotRmsAbvGrd <int>, $Functional <fct>, $Fireplaces <int>,
## # $FireplaceQu <dbl>, $GarageType <fct>, $GarageYrBlt <int>,
## # $GarageFinish <dbl>, $GarageCars <int>, $GarageArea <int>,
## # $GarageQual <dbl>, $GarageCond <dbl>, $PavedDrive <fct>,
## # $WoodDeckSF <int>, $OpenPorchSF <int>, $EnclosedPorch <int>,
## # $X3SsnPorch <int>, $ScreenPorch <int>, $PoolArea <int>, $Fence <dbl>,
## # $MiscVal <int>, $MoSold <int>, $YrSold <int>, $SaleType <fct>,
## # $SaleCondition <fct>, $SalePrice <int>, $HasPool <lgl>,
## # $HasBonus <lgl>
#View(abs(cor(house))[,81])
plot(log(SalePrice) ~ Street, data = house2)# Street is worth keeping.
plot(log(SalePrice) ~ YearBuilt, col = factor(Alley), data = house2, pch = 16)
plot(log(SalePrice) ~ Alley, data = house2)
plot(log(SalePrice) ~ LotShape, data = house) # There's a trend, but I'm going to ignore it.
plot(log(SalePrice) ~ YearBuilt, data = house2, col = MSZoning)
house2 %>%
ggplot(aes(y = log(SalePrice), x = YearBuilt, col = MSZoning)) +
geom_point() +
facet_wrap(house2$MSZoning)
# This also takes too long. I can't do this sort of thing with all of the columns
colnames(house2)
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "X1stFlrSF" "X2ndFlrSF"
## [45] "LowQualFinSF" "GrLivArea" "BsmtFullBath" "BsmtHalfBath"
## [49] "FullBath" "HalfBath" "BedroomAbvGr" "KitchenAbvGr"
## [53] "KitchenQual" "TotRmsAbvGrd" "Functional" "Fireplaces"
## [57] "FireplaceQu" "GarageType" "GarageYrBlt" "GarageFinish"
## [61] "GarageCars" "GarageArea" "GarageQual" "GarageCond"
## [65] "PavedDrive" "WoodDeckSF" "OpenPorchSF" "EnclosedPorch"
## [69] "X3SsnPorch" "ScreenPorch" "PoolArea" "Fence"
## [73] "MiscVal" "MoSold" "YrSold" "SaleType"
## [77] "SaleCondition" "SalePrice" "HasPool" "HasBonus"
pairs(sample_n(cbind(log(house2$SalePrice),house2[,2:10]), 200))
# I'll take LotArea. #5
pairs(sample_n(cbind(log(house2$SalePrice),house2[,11:20]), 200))
# OverallQual, YearBuilt #18, 20
pairs(sample_n(cbind(log(house2$SalePrice),house2[,21:30]), 200))
# maybe YearRemodAdd, maybe MasVnrArea #21, 27
pairs(sample_n(cbind(log(house2$SalePrice),house2[,31:40]), 200))
# TotalBsmtSF #39
pairs(sample_n(cbind(log(house2$SalePrice),house2[,41:50]), 200))
# X1stFlrSF, X2ndFlrSF, GrLivArea #43, 44, 46
pairs(sample_n(cbind(log(house2$SalePrice),house2[,51:60]), 200))
# TotRmsAbvGrd #54
pairs(sample_n(cbind(log(house2$SalePrice),house2[,61:70]), 200))
# GarageArea #62
pairs(sample_n(cbind(log(house2$SalePrice),house2[,71:80]), 200))
# HasPool, HasBonus #79, 80 'table(house2$HasPool)': There are only 7 houses with a pool.
summary(lm(log(SalePrice) ~ LotArea, data = house2))
# adj_r2 = 0.06557
summary(lm(log(SalePrice) ~ YearBuilt, data = house2))
# Adjusted R-squared: 0.3436
summary(lm(log(SalePrice) ~ LotArea + YearBuilt, data = house2))
# adj_r2 = 0.4053
summary(lm(log(SalePrice) ~ YearBuilt + LotArea:YearBuilt, data = house2))
# adj_r2 = 0.4054
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + LotArea:YearBuilt, data = house2))
# adj_r2 = 0.4056 #these hardly increase
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual, data = house2))
# Adjusted R-squared: 0.7208
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + YearBuilt:OverallQual, data = house2))
# Adjusted R-squared: 0.7207
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF, data = house2))
# Adjusted R-squared: 0.7425
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF + X1stFlrSF, data = house2))
# Adjusted R-squared: 0.757
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF + X1stFlrSF + X2ndFlrSF, data = house2))
# Adjusted R-squared: 0.8055
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + GrLivArea, data = house2))
# Adjusted R-squared: 0.8054 That didn't increase it and I think because this is a lot of the same information.
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + X1stFlrSF:X2ndFlrSF, data = house2))
# Adjusted R-squared: 0.8111 I might keep that.
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + X1stFlrSF:X2ndFlrSF + GarageArea, data = house2))
# Adjusted R-squared: 0.82
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + X1stFlrSF:X2ndFlrSF + GarageArea + YearRemodAdd, data = house2))
# Adjusted R-squared: 0.8276
summary(lm(log(SalePrice) ~ LotArea + YearBuilt + OverallQual + TotalBsmtSF + X1stFlrSF + X2ndFlrSF + X1stFlrSF:X2ndFlrSF + GarageArea + YearRemodAdd + MasVnrArea, data = house2))
# Adjusted R-squared: 0.8276 # No increase
summary(lm(log(SalePrice) ~ MasVnrArea, data = house2))
# let's pause to check some of this:
## Not using this:
# adj_r2s <- rep(0, 12)
# i <- 1
#
# for (attribute in c(log(house2$SalePrice), house2$LotArea, house2$YearBuilt, house2$OverallQual, house2$TotalBsmtSF, house2$X1stFlrSF, house2$X2ndFlrSF, house2$GarageArea, house2$YearRemodAdd, house2$MasVnrArea, house2$HasPool, house2$HasBonus)) {
# lm0 <- lm(log(house2$SalePrice) ~ attribute)
# adj_r2s[i] <-
#
# }
#View(cbind(Attribute = c("log(house2$SalePrice)", "house2$LotArea", "house2$YearBuilt", "house2$OverallQual", "house2$TotalBsmtSF", "house2$X1stFlrSF", "house2$X2ndFlrSF", "house2$GarageArea", "house2$YearRemodAdd", "house2$MasVnrArea", "house2$HasPool", "house2$HasBonus"), '|Correlation|' = abs(cor(cbind(log(house2$SalePrice), house2$LotArea, house2$YearBuilt, house2$OverallQual, house2$TotalBsmtSF, house2$X1stFlrSF, house2$X2ndFlrSF, house2$GarageArea, house2$YearRemodAdd, house2$MasVnrArea, house2$HasPool, house2$HasBonus)))[,1]))
house3 <- data.frame(LogPrice = log(house2$SalePrice), Lot = house2$LotArea, Year = house2$YearBuilt, Quality = house2$OverallQual, Basement = house2$TotalBsmtSF, X1stFlr = house2$X1stFlrSF, X2ndFlr = house2$X2ndFlrSF, Garage = house2$GarageArea, Remodel = house2$YearRemodAdd, Masonry = house2$MasVnrArea, Pool = house2$HasPool, Bonus = house2$HasBonus)
# pairs(house3)
house3 <- cbind(house3, MasonType = house2$MasVnrType) %>%
mutate(Stone = MasonType == 'Stone') %>%
mutate(BrkCmn = MasonType == 'BrkCmn') %>%
mutate(BrkFace = MasonType == 'BrkFace') %>%
mutate(MasonType = case_when(MasonType == 'None' ~ 0, MasonType == 'BrkCmn' ~ 1, MasonType == 'BrkFace' ~ 2, MasonType == 'Stone' ~ 3))
pairs(sample_n(house3, 250), col = factor(house3$MasonType))
summary(lm(LogPrice ~ Lot + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Stone:Masonry, data = house3)) # The Masonry significance levels aren't that high.
summary(lm(LogPrice ~ Lot + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Stone:Masonry + Pool + Bonus, data = house3))
summary(lm(LogPrice ~ Lot + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Stone:Masonry + Pool, data = house3))
summary(lm(LogPrice ~ Lot + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool, data = house3)) # Best score of all of these
summary(lm(LogPrice ~ Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool, data = house3)) # Nope. It's better with Lot.
# A couple more things though:
summary(lm(LogPrice ~ Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool + Masonry:MasonType, data = house3))
summary(lm(LogPrice ~ Year + Quality + Basement + X1stFlr + X2ndFlr + Garage + Remodel + Pool, data = house3))
my_model <- lm(LogPrice ~ Lot + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool, data = house3)
summary(my_model)
##
## Call:
## lm(formula = LogPrice ~ Lot + Year + Quality + Basement + X1stFlr +
## X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool, data = house3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98555 -0.07294 0.00885 0.08860 0.59656
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.712e+00 5.244e-01 5.171 2.65e-07 ***
## Lot 3.383e-06 4.637e-07 7.296 4.86e-13 ***
## Year 1.829e-03 2.054e-04 8.908 < 2e-16 ***
## Quality 9.672e-02 5.200e-03 18.602 < 2e-16 ***
## Basement 8.897e-05 1.870e-05 4.759 2.14e-06 ***
## X1stFlr 3.252e-04 2.292e-05 14.193 < 2e-16 ***
## X2ndFlr 4.198e-04 3.454e-05 12.155 < 2e-16 ***
## Garage 2.334e-04 2.693e-05 8.667 < 2e-16 ***
## Remodel 2.222e-03 2.760e-04 8.048 1.73e-15 ***
## PoolTRUE -1.829e-01 6.627e-02 -2.759 0.00587 **
## X1stFlr:X2ndFlr -1.519e-07 2.564e-08 -5.925 3.90e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1655 on 1449 degrees of freedom
## Multiple R-squared: 0.8296, Adjusted R-squared: 0.8284
## F-statistic: 705.3 on 10 and 1449 DF, p-value: < 2.2e-16
plot(my_model, which = 1)
lot_lm <- lm(LogPrice ~ Lot, data = house3)
plot(lot_lm, which = 1)
boxCox(lot_lm)
lot_lm2 <- lm(LogPrice ~ log(Lot), data = house3)
plot(lot_lm2, which = 1)
lot_lm3 <- lm(LogPrice ~ I(Lot^2), data = house3)
plot(lot_lm3, which = 1)
# I like Logging it better
summary(lot_lm2)
##
## Call:
## lm(formula = LogPrice ~ log(Lot), data = house3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.55029 -0.23028 -0.02586 0.24477 1.32863
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.21133 0.16910 54.47 <2e-16 ***
## log(Lot) 0.30872 0.01853 16.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3662 on 1458 degrees of freedom
## Multiple R-squared: 0.1599, Adjusted R-squared: 0.1594
## F-statistic: 277.6 on 1 and 1458 DF, p-value: < 2.2e-16
my_model2 <- lm(LogPrice ~ log(Lot) + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool, data = house3)
summary(my_model2) # Yeah, that's more like it.
##
## Call:
## lm(formula = LogPrice ~ log(Lot) + Year + Quality + Basement +
## X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel +
## Pool, data = house3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.85259 -0.07168 0.01186 0.08409 0.61679
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.172e+00 5.244e-01 2.234 0.02562 *
## log(Lot) 1.206e-01 9.485e-03 12.711 < 2e-16 ***
## Year 1.992e-03 1.989e-04 10.015 < 2e-16 ***
## Quality 1.003e-01 5.032e-03 19.933 < 2e-16 ***
## Basement 9.104e-05 1.802e-05 5.052 4.92e-07 ***
## X1stFlr 2.744e-04 2.268e-05 12.100 < 2e-16 ***
## X2ndFlr 4.000e-04 3.341e-05 11.972 < 2e-16 ***
## Garage 1.871e-04 2.635e-05 7.099 1.96e-12 ***
## Remodel 2.332e-03 2.668e-04 8.740 < 2e-16 ***
## PoolTRUE -1.770e-01 6.399e-02 -2.766 0.00575 **
## X1stFlr:X2ndFlr -1.499e-07 2.477e-08 -6.053 1.81e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1598 on 1449 degrees of freedom
## Multiple R-squared: 0.841, Adjusted R-squared: 0.8399
## F-statistic: 766.6 on 10 and 1449 DF, p-value: < 2.2e-16
# I realize I didn't explore with a sample of the data, but I can still test by creating the model from a smaller sample.
training_rows <- sample(1:length(house3$LogPrice), length(house3$LogPrice)*0.8)
training_data <- house3[training_rows,]
training_data$Price <- exp(training_data$LogPrice) # I had logged the prices, after all. This is y_hat_i
testing_data <- house3[-training_rows,]
testing_data$Price <- exp(testing_data$LogPrice)
final_model <- lm(log(Price) ~ log(Lot) + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool, data = training_data)
summary(final_model)
##
## Call:
## lm(formula = log(Price) ~ log(Lot) + Year + Quality + Basement +
## X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel +
## Pool, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.73879 -0.07425 0.01211 0.08452 0.69049
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.190e+00 6.019e-01 1.978 0.048186 *
## log(Lot) 1.257e-01 1.097e-02 11.457 < 2e-16 ***
## Year 2.048e-03 2.280e-04 8.985 < 2e-16 ***
## Quality 1.024e-01 5.763e-03 17.774 < 2e-16 ***
## Basement 7.934e-05 2.057e-05 3.857 0.000121 ***
## X1stFlr 2.740e-04 2.597e-05 10.553 < 2e-16 ***
## X2ndFlr 4.127e-04 3.764e-05 10.964 < 2e-16 ***
## Garage 1.957e-04 2.973e-05 6.581 7.07e-11 ***
## Remodel 2.242e-03 3.053e-04 7.341 3.99e-13 ***
## PoolTRUE -2.130e-01 7.922e-02 -2.689 0.007265 **
## X1stFlr:X2ndFlr -1.624e-07 2.764e-08 -5.876 5.47e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1631 on 1157 degrees of freedom
## Multiple R-squared: 0.839, Adjusted R-squared: 0.8376
## F-statistic: 603.1 on 10 and 1157 DF, p-value: < 2.2e-16
predictions <- exp(predict.lm(final_model, new_data = testing_data)) # I must be doing something wrong here.
mean_price <- mean(testing_data$Price) # y_bar
targets <- testing_data$Price # y_i
SSE <- sum(predictions - targets)^2 # Sum of squared residuals
SSTO <- sum(targets - mean_price)^2 # Total sum of squares
DF <- length(targets) - length(final_model$coefficients) # Degrees of freedom
print("Adjusted R-Squared:")
## [1] "Adjusted R-Squared:"
(Adjusted_R_Squared <- 1 - (length(targets) - 1)/DF * SSE/SSTO)
## [1] -2.471374e+30
> predict(final_model, data.frame(training_data[2,]))
1343
12.52126
> exp(predict(final_model, data.frame(training_data[2,])))
1343
274103.6
> training_data[2,]
LogPrice Lot Year Quality Basement X1stFlr X2ndFlr Garage Remodel Masonry Pool Bonus MasonType Stone BrkCmn BrkFace
1343 12.33929 9375 2002 8 1284 1284 885 647 2002 149 FALSE FALSE 2 FALSE FALSE TRUE Price 1343 228500
> predict(final_model, data.frame(testing_data[2,]))
11
11.75375
> exp(predict(final_model, data.frame(testing_data[2,])))
11
127230
> testing_data[2,]
LogPrice Lot Year Quality Basement X1stFlr X2ndFlr Garage Remodel Masonry Pool Bonus MasonType Stone BrkCmn BrkFace
11 11.77144 11200 1965 5 1040 1040 0 384 1965 0 FALSE FALSE 0 FALSE FALSE FALSE
Price
11 129500
> sum(predictions - targets)^2
[1] 5.036271e+13
> sum((predictions - targets)^2)
[1] 1.448868e+13
mean_price <- mean(testing_data$Price) # y_bar
targets <- testing_data$Price # y_i
SSE <- sum((predictions - targets)^2) # Sum of squared residuals
SSTO <- sum((targets - mean_price)^2) # Total sum of squares
DF <- length(targets) - length(final_model$coefficients) # Degrees of freedom
print("Adjusted R-Squared:")
## [1] "Adjusted R-Squared:"
(Adjusted_R_Squared <- 1 - (length(targets) - 1)/DF * SSE/SSTO)
## [1] -8.288766
print("R-Squared:")
## [1] "R-Squared:"
(Adjusted_R_Squared <- 1 - SSE/SSTO)
## [1] -7.969565
new_final_model <- lm(log(Price) ~ log(Lot) + Year + Quality + Basement + X1stFlr + Garage + X2ndFlr, data = training_data)
summary(new_final_model)
##
## Call:
## lm(formula = log(Price) ~ log(Lot) + Year + Quality + Basement +
## X1stFlr + Garage + X2ndFlr, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.17717 -0.07435 0.01223 0.08916 0.52071
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.078e+00 4.378e-01 9.316 < 2e-16 ***
## log(Lot) 1.231e-01 1.145e-02 10.751 < 2e-16 ***
## Year 2.856e-03 2.182e-04 13.094 < 2e-16 ***
## Quality 1.192e-01 5.783e-03 20.614 < 2e-16 ***
## Basement 5.568e-05 2.133e-05 2.610 0.00917 **
## X1stFlr 2.078e-04 2.482e-05 8.373 < 2e-16 ***
## Garage 2.027e-04 3.107e-05 6.523 1.03e-10 ***
## X2ndFlr 1.996e-04 1.381e-05 14.460 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1706 on 1160 degrees of freedom
## Multiple R-squared: 0.8234, Adjusted R-squared: 0.8224
## F-statistic: 772.8 on 7 and 1160 DF, p-value: < 2.2e-16
predictions <- exp(predict.lm(final_model, new_data = testing_data))
mean_price <- mean(testing_data$Price) # y_bar
targets <- testing_data$Price # y_i
SSE <- sum((predictions - targets)^2) # Sum of squared residuals
SSTO <- sum((targets - mean_price)^2) # Total sum of squares
DF <- length(targets) - length(final_model$coefficients) # Degrees of freedom
print("Adjusted R-Squared:")
## [1] "Adjusted R-Squared:"
(Adjusted_R_Squared <- 1 - (length(targets) - 1)/DF * SSE/SSTO)
## [1] -8.288766
new_final_model <- lm(log(Price) ~ Year, data = training_data)
summary(new_final_model)
##
## Call:
## lm(formula = log(Price) ~ Year, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.24874 -0.19009 -0.03155 0.16857 1.67141
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.5983295 0.6227112 -5.778 9.66e-09 ***
## Year 0.0079271 0.0003158 25.100 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3262 on 1166 degrees of freedom
## Multiple R-squared: 0.3508, Adjusted R-squared: 0.3502
## F-statistic: 630 on 1 and 1166 DF, p-value: < 2.2e-16
predictions <- exp(predict.lm(final_model, new_data = data.frame(testing_data)))
mean_price <- mean(testing_data$Price) # y_bar
targets <- testing_data$Price # y_i
SSE <- sum((predictions - targets)^2) # Sum of squared residuals
SSTO <- sum((targets - mean_price)^2) # Total sum of squares
DF <- length(targets) - length(final_model$coefficients) # Degrees of freedom
print("Adjusted R-Squared:")
## [1] "Adjusted R-Squared:"
(Adjusted_R_Squared <- 1 - (length(targets) - 1)/DF * SSE/SSTO)
## [1] -8.288766
smaller_data <- sample_n(house3, 400)
plot(LogPrice ~ Year, data = smaller_data)
abline(lm(LogPrice ~ Year, data = house3))
plot(LogPrice ~ Quality, data = smaller_data)
abline(lm(LogPrice ~ Quality, data = house3))
plot(LogPrice ~ Basement, data = smaller_data)
abline(lm(LogPrice ~ Basement, data = house3))
plot(LogPrice ~ X1stFlr, data = smaller_data)
abline(lm(LogPrice ~ X1stFlr, data = house3))
plot(LogPrice ~ X2ndFlr, data = smaller_data)
abline(lm(LogPrice ~ X2ndFlr, data = house3))
library(plotly) # Code for 3D plot below coppied from notebook
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
#Perform the multiple regression
three_d_lm <- lm(LogPrice ~ X1stFlr + X2ndFlr + X1stFlr:X2ndFlr, data = house3)
#Graph Resolution (more important for more complex shapes)
graph_reso <- 20
#Setup Axis
axis_x <- seq(min(smaller_data$X1stFlr), max(smaller_data$X1stFlr), by = graph_reso)
axis_y <- seq(min(smaller_data$X2ndFlr), max(smaller_data$X2ndFlr), by = graph_reso)
#Sample points
house_surface <- expand.grid(X1stFlr = axis_x, X2ndFlr = axis_y, KEEP.OUT.ATTRS = F)
house_surface$Z <- predict.lm(three_d_lm, newdata = house_surface)
house_surface <- acast(house_surface, X2ndFlr ~ X1stFlr, value.var = "Z") #y ~ x
#Create scatterplot
plot_ly(smaller_data,
x = ~X1stFlr,
y = ~X2ndFlr,
z = ~LogPrice,
text = rownames(smaller_data),
type = "scatter3d",
mode = "markers") %>%
add_trace(z = house_surface,
x = axis_x,
y = axis_y,
type = "surface")
## Warning: 'surface' objects don't have these attributes: 'mode'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'selectedpoints', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'z', 'x', 'y', 'text', 'surfacecolor', 'cauto', 'cmin', 'cmax', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'contours', 'hidesurface', 'lightposition', 'lighting', '_deprecated', 'xcalendar', 'ycalendar', 'zcalendar', 'scene', 'idssrc', 'customdatasrc', 'hoverinfosrc', 'zsrc', 'xsrc', 'ysrc', 'textsrc', 'surfacecolorsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
plot(LogPrice ~ Garage, data = smaller_data)
abline(lm(LogPrice ~ Garage, data = house3))
plot(LogPrice ~ Remodel, data = smaller_data)
abline(lm(LogPrice ~ Remodel, data = house3))
palette(c("firebrick", "skyblue"))
plot(LogPrice ~ Year, data = house3, col = factor(house3$Pool), pch = 16, cex = 0.5, main = "Blue dots mean the house has a pool")
pool_lm <- lm(LogPrice ~ Year + Pool, data = house3)
abline(pool_lm$coef[1], pool_lm$coef[2], lwd = 2, col = "firebrick")
abline(pool_lm$coef[1] + pool_lm$coef[3], pool_lm$coef[2], lwd = 2, col = "skyblue")
# my_model2 <- lm(LogPrice ~ log(Lot) + Year + Quality + Basement + X1stFlr + X2ndFlr + X1stFlr:X2ndFlr + Garage + Remodel + Pool, data = house3)